import pandas as pd
from config import config
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import os
import json
# Show DataFrames without any row/column truncation.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)
# Default to square 7x7 inch matplotlib figures.
plt.rc('figure', figsize=[7, 7])
# The cache directory must exist before any cached computation writes into it.
os.makedirs('cache', exist_ok=True)
Please run "all.sh" before executing this notebook!
Business Understanding
config[-1]
{'year': '2020',
'url': 'https://drive.google.com/uc?id=1dfGerWeWkcyQ9GX9x20rdSGj7WtEpzBB&export=download',
'packed_path': 'data/2020.zip',
'unpack_path': 'data/unpack/2020.zip',
'data_path': 'data/unpack/2020.zip/survey_results_public.csv',
'json_path': 'data/2020.json',
'numeric_columns': ['Age',
'Age1stCode',
'YearsCode',
'YearsCodePro',
'WorkWeekHrs'],
'leave_columns': ['CompTotal', 'Respondent', 'ConvertedComp'],
'exclusive_columns': ['CompFreq',
'Country',
'CurrencyDesc',
'CurrencySymbol',
'EdLevel',
'Employment',
'Ethnicity',
'Hobbyist',
'JobSat',
'JobSeek',
'MainBranch',
'NEWDevOps',
'NEWDevOpsImpt',
'NEWEdImpt',
'NEWLearn',
'NEWOffTopic',
'NEWOnboardGood',
'NEWOtherComms',
'NEWOvertime',
'NEWPurpleLink',
'NEWSOSites',
'OpSys',
'OrgSize',
'PurchaseWhat',
'Sexuality',
'SOAccount',
'SOComm',
'SOPartFreq',
'SOVisitFreq',
'SurveyEase',
'SurveyLength',
'Trans',
'UndergradMajor',
'WelcomeChange']}
df_raw = pd.read_csv(config[-1]['data_path'], dtype=str)
The salary is encoded here as ConvertedComp — it is in USD.
len(df_raw)
64461
df_raw.columns
Index(['Respondent', 'MainBranch', 'Hobbyist', 'Age', 'Age1stCode', 'CompFreq',
'CompTotal', 'ConvertedComp', 'Country', 'CurrencyDesc',
'CurrencySymbol', 'DatabaseDesireNextYear', 'DatabaseWorkedWith',
'DevType', 'EdLevel', 'Employment', 'Ethnicity', 'Gender', 'JobFactors',
'JobSat', 'JobSeek', 'LanguageDesireNextYear', 'LanguageWorkedWith',
'MiscTechDesireNextYear', 'MiscTechWorkedWith',
'NEWCollabToolsDesireNextYear', 'NEWCollabToolsWorkedWith', 'NEWDevOps',
'NEWDevOpsImpt', 'NEWEdImpt', 'NEWJobHunt', 'NEWJobHuntResearch',
'NEWLearn', 'NEWOffTopic', 'NEWOnboardGood', 'NEWOtherComms',
'NEWOvertime', 'NEWPurchaseResearch', 'NEWPurpleLink', 'NEWSOSites',
'NEWStuck', 'OpSys', 'OrgSize', 'PlatformDesireNextYear',
'PlatformWorkedWith', 'PurchaseWhat', 'Sexuality', 'SOAccount',
'SOComm', 'SOPartFreq', 'SOVisitFreq', 'SurveyEase', 'SurveyLength',
'Trans', 'UndergradMajor', 'WebframeDesireNextYear',
'WebframeWorkedWith', 'WelcomeChange', 'WorkWeekHrs', 'YearsCode',
'YearsCodePro'],
dtype='object')
The Stack Overflow data contains a lot of columns (61 in total). Most of the columns hold a string drawn from a set of possible answers; only a small number contain floating point values. In total, we have 64461 answers, of which about 53.9% (34756) contain an answer related to the current job salary — less than for the job satisfaction question we analysed at the beginning of this year (70%).
df = df_raw[~ df_raw['ConvertedComp'].isnull()]
float(len(df)) / len(df_raw)
0.5391787282232667
# del df_raw
len(df)
34756
df['CompTotal'].describe()
count 34756 unique 2997 top 1e+05 freq 767 Name: CompTotal, dtype: object
set(df['CompFreq'])
{'Monthly', 'Weekly', 'Yearly'}
We have a total of 2997 unique values. But these values can be referenced as 'Monthly', 'Weekly' and 'Yearly' — that needs to be considered when doing the data preparation.
All rows that don't have a salary defined were removed from the dataset. I removed as well all data points that are more than 3 standard deviations away from the mean, resulting in the drop of 3241 records.
I created new columns out of the existing ones in order to extract the features we want to analyse. This was done for all columns.
def load(column:str, year:int) -> pd.DataFrame:
    '''
    Load the precalculated features of one survey column from the cache.
    The features are computed ahead of time (see all.sh) and stored as
    JSON-lines files under data/features/.
    :param column: (str) The survey column whose features should be loaded
    :param year: (int) the year of the data, used as cache key
    :returns: pd.DataFrame with the cached, precalculated features
    :raises FileNotFoundError: if the cache file has not been generated yet
    '''
    cache_key = f"{year}_{column}.json"
    cache_path = os.path.join('data', 'features', cache_key)
    # Fail loudly instead of silently returning None when the cache is
    # missing — a None return would only crash cryptically at the caller.
    if not os.path.exists(cache_path):
        raise FileNotFoundError(
            f"feature cache {cache_path} is missing - run all.sh first")
    return pd.read_json(cache_path, lines=True)
def plot_distribution(column:str, year:int, sort_by:str='values') -> go.Figure:
    '''
    Plots the distribution of a single (cached) feature column.
    :param column: (str) The survey column to plot
    :param year: (int) survey year, used to locate the cached features
    :param sort_by: (str) 'values' renders a sorted horizontal bar chart of
        the one-hot encoded answer counts; 'numeric' renders a histogram of
        the numeric column (rows flagged in the *_NA column are excluded)
    :returns: go.Figure, ready to use plotly figure
    :raises ValueError: if sort_by is neither 'values' nor 'numeric'
    '''
    df_cached = load(column, year)
    # The label and id columns are not part of the distribution itself.
    for meta_col in ("CompTotal", "Respondent"):
        if meta_col in df_cached.columns:
            df_cached = df_cached.drop(meta_col, axis=1)
    if sort_by == 'values':
        df_sum = df_cached.sum().to_frame()
        df_sorted = df_sum.sort_values(by=0, ascending=True)
        # One-hot column names look like '<Column>_<answer>'; keep the answer.
        y = list(map(lambda x: x.split('_')[-1], df_sorted.index))
        x = df_sorted.values.flatten()
        fig = go.Figure(data=[go.Bar(x=x, y=y, text=y, orientation='h', )])
    elif sort_by == 'numeric':
        col = df_cached.columns
        # Expect exactly one value column and one *_NA indicator column.
        real_col = list(filter(lambda x: 'NA' not in x, col))[0]
        na_col = list(filter(lambda x: 'NA' in x, col))[0]
        # Only plot rows where the value was actually provided.
        x = df_cached[df_cached[na_col].isnull()][real_col]
        fig = go.Figure(data=[go.Histogram(x=x, bingroup=100) ])
    else:
        # Previously an unknown sort_by left `fig` unbound (UnboundLocalError).
        raise ValueError(f"unknown sort_by value: {sort_by!r}")
    return fig
"""
Loads the calculated max values (after outlier cleaning)
"""
with open(os.path.join('data', 'meta', 'max.json'), 'rt') as max_reader:
max_values = json.loads(max_reader.read())
When we have a column with a numeric value, we can directly use it as a feature. No scaling is done, as I assume that the NN is able to handle the unscaled data correctly. If a value is NaN, we extract it into a new column named <column_name>_NA. This prevents the deletion of the entire row only because (some) information is missing. The NA columns are not shown here.
This column reflects the number of hours worked per week.
plot_distribution('WorkWeekHrs', 2020, 'numeric').show()
Scaled output of the distribution. We still have an imbalanced distribution even after removing the outliers. I would also consider 94 hours of work a really large value.
max_values['WorkWeekHrs']
94.23067483393056
This column reflects the age of the user.
plot_distribution('Age', 2020, 'numeric').show()
Most participants are about 25 years old. After scaling and removing the outliers, we have a maximum value for age of 59. The first bucket is the biggest one, suggesting most people are really young when visiting Stack Overflow.
max_values['Age']
59.58997142020077
This column reflects the age the user was when they started to code.
plot_distribution('Age1stCode', 2020, 'numeric').show()
max_values['Age1stCode']
30.8186816615149
A more Gaussian distribution here — most people start coding at age 15.
This column reflects the number of years the user has been coding.
plot_distribution('YearsCode', 2020, 'numeric').show()
max_values['YearsCode']
41.25377132014002
Most users have 10 years of coding experience in general.
This column reflects the number of years of coding in a professional environment.
plot_distribution('YearsCodePro', 2020, 'numeric').show()
Most of the users have a quite low amount of coding experience in a professional environment. This can be an indicator that they are more likely to use Stack Overflow than more experienced developers, who can read the documentation of the particular framework directly.
This will give an insight into the categorical data we found. Some answers are exclusive — for example, only one company size could be selected. But for some other answers, entire sets could be selected, like in the used programming languages. Both kinds were one-hot encoded.
This column reflects the payment frequency.
plot_distribution('CompFreq', 2020, 'values').show()
19107 respondents gave yearly salary information, 14680 monthly and 969 weekly. Many don't answer the question at all.
plot_distribution('Country', 2020).show()
We have a lot of countries with a low number of responses. It's possible that they get removed when we split into train and test sets. 12k answers come from the USA, then a fast-dropping distribution follows.
As we want to predict the salary, we use the ConvertedComp column in the data. The payment frequency and the different currencies have been converted to one base, allowing a better comparison. From the Stack Overflow schema file:
ConvertedComp,"Salary converted to annual USD salaries using the exchange rate on 2020-02-19, assuming 12 | working months and 50 working weeks."
label_df = pd.read_json(os.path.join("data", "features", "2020_Age.json"), lines=True)
label_df.columns
Index(['Respondent', 'Age_NA', 'CompTotal', 'ConvertedComp', 'Age'], dtype='object')
# Distribution of the label (annual salary in USD): histogram over all
# respondents that provided it; NaN salaries are excluded.
col = 'ConvertedComp'
x = label_df[~ label_df[col].isnull()][col]
fig = go.Figure(data=[go.Histogram(x=x, bingroup=100) ])
fig.show()
max_values['ConvertedComp']
579262.7337095363
Most have a low annual income of about 10k. But this may be a lot of money, depending on the geographical position of the user. After 200k the number of items per bucket gets low — only a few users have such a high income. When looking at the outlier-cleaned data above, the max value moves to 580k — still a lot of money. In a future iteration I may consider removing even more outliers.
To reduce the required memory (and the amount of features the user has to provide for a prediction), I trained a simple regression model on each column using TensorFlow 2.0 (with Keras). Then the best 15 models were combined together to produce the final model (see shared/train.py in the repository for details).
I used Mean Squared Error as the metric to compare the models. The best single model used the country feature — not a big surprise, as most of the data came from US people and the salary should be high there as well.
# Collect the validation metrics of every single-feature model.
# Each file in data/metrics holds the metrics dict of one trained model;
# remember the filename so the feature can be identified in the table.
evaluation = []
for f in os.listdir(os.path.join("data", "metrics")):
    with open(os.path.join("data", "metrics", f), 'rt') as metric_reader:
        data = json.load(metric_reader)
        data['file'] = f
        evaluation.append(data)
# Rank the models by validation MSE (lower is better) and show the top 16.
pd.DataFrame(evaluation).sort_values('val_mean_squared_error')[['file', 'val_mean_squared_error', ]][0:16]
| file | val_mean_squared_error | |
|---|---|---|
| 28 | 2020_Country.json | [0.009920666925609112] |
| 19 | 2020_CurrencyDesc.json | [0.010503113269805908] |
| 6 | 2020_CurrencySymbol.json | [0.010630987584590912] |
| 53 | 2020_YearsCode.json | [0.010961346328258514] |
| 5 | 2020_YearsCodePro.json | [0.012053127400577068] |
| 29 | 2020_WebframeWorkedWith.json | [0.012106635607779026] |
| 9 | 2020_CompFreq.json | [0.012175439856946468] |
| 44 | 2020_WelcomeChange.json | [0.012430550530552864] |
| 39 | 2020_NEWSOSites.json | [0.01255364716053009] |
| 27 | 2020_Ethnicity.json | [0.01259028259664774] |
| 7 | 2020_JobSeek.json | [0.012596573680639267] |
| 11 | 2020_Age.json | [0.012640145607292652] |
| 2 | 2020_Sexuality.json | [0.012656370177865028] |
| 33 | 2020_OpSys.json | [0.01281856931746006] |
| 31 | 2020_JobSat.json | [0.012859459035098553] |
| 52 | 2020_NEWCollabToolsWorkedWith.json | [0.012873232364654541] |
The order of the quality of the features, top 15, sorted by mean squared error on the validation (test) set. This is the same order in which the user has to provide information for a custom prediction.
To have something to deploy this time as well, I packed the Keras model into TensorFlow.js and serve it via TensorFlow.js on my blog. It's all rendered and predicted in the browser, to make sure the user is not scared of putting (sensitive) data into the application.
!jupyter nbconvert --execute --to html analysis.ipynb
[NbConvertApp] Converting notebook analysis.ipynb to html
Traceback (most recent call last):
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbformat/reader.py", line 14, in parse_json
nb_dict = json.loads(s, **kwargs)
File "/Users/d.murawski/.pyenv/versions/3.8.6/lib/python3.8/json/__init__.py", line 357, in loads
return _default_decoder.decode(s)
File "/Users/d.murawski/.pyenv/versions/3.8.6/lib/python3.8/json/decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "/Users/d.murawski/.pyenv/versions/3.8.6/lib/python3.8/json/decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/bin/jupyter-nbconvert", line 8, in <module>
sys.exit(main())
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/jupyter_core/application.py", line 254, in launch_instance
return super(JupyterApp, cls).launch_instance(argv=argv, **kwargs)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
app.start()
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbconvert/nbconvertapp.py", line 346, in start
self.convert_notebooks()
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbconvert/nbconvertapp.py", line 518, in convert_notebooks
self.convert_single_notebook(notebook_filename)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbconvert/nbconvertapp.py", line 483, in convert_single_notebook
output, resources = self.export_single_notebook(notebook_filename, resources, input_buffer=input_buffer)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbconvert/nbconvertapp.py", line 412, in export_single_notebook
output, resources = self.exporter.from_filename(notebook_filename, resources=resources)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbconvert/exporters/exporter.py", line 181, in from_filename
return self.from_file(f, resources=resources, **kw)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbconvert/exporters/exporter.py", line 199, in from_file
return self.from_notebook_node(nbformat.read(file_stream, as_version=4), resources=resources, **kw)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbformat/__init__.py", line 143, in read
return reads(buf, as_version, **kwargs)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbformat/__init__.py", line 73, in reads
nb = reader.reads(s, **kwargs)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbformat/reader.py", line 58, in reads
nb_dict = parse_json(s, **kwargs)
File "/Users/d.murawski/.local/share/virtualenvs/stackoverflowcapstone-K84ddafS/lib/python3.8/site-packages/nbformat/reader.py", line 17, in parse_json
raise NotJSONError(("Notebook does not appear to be JSON: %r" % s)[:77] + "...") from e
nbformat.reader.NotJSONError: Notebook does not appear to be JSON: ''...